# imports
import os
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.figure_factory as ff
import plotly.express as px
import plotly.io as pio
from plotly import tools
from plotly.subplots import make_subplots
from plotly.offline import iplot
from IPython.display import display
from sklearn.decomposition import PCA
# Make "notebook" the default Plotly renderer for the figures shown below.
pio.renderers.default = "notebook"
# Read results.csv from the current working directory and take a first look.
results_path = os.path.join(os.getcwd(), 'results.csv')
df = pd.read_csv(results_path)

# Show both ends of the frame, each preceded by its heading.
for heading, preview in (
    ('df head - first 5 el\n', df.head()),
    ('df tail - last 5 el\n', df.tail()),
):
    print(heading)
    display(preview)

# Column dtypes and non-null counts (df.info() prints its report itself).
print('df info - field insight\n')
df.info()

# Summary statistics of the numeric columns.
print('df describe - numerical values analysis\n')
display(df.describe())
df head - first 5 el
| id_alg | param_1 | param_2 | param_3 | id_dataset | param_4 | mean_ind | std_ind | ind_0 | ind_1 | ind_2 | ind_3 | ind_4 | ind_5 | ind_6 | ind_7 | ind_8 | ind_9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | -1 | 0 | 5 | 1 | 60 | 0.92 | 0.074833 | 0.9 | 0.9 | 0.9 | 1.0 | 1.0 | 0.8 | 1.0 | 0.9 | 1.0 | 0.8 |
| 1 | 2 | -1 | 1 | 5 | 1 | 60 | 0.93 | 0.064031 | 0.9 | 0.9 | 1.0 | 1.0 | 1.0 | 0.8 | 0.9 | 0.9 | 1.0 | 0.9 |
| 2 | 3 | -1 | 0 | 10 | 1 | 60 | 0.93 | 0.064031 | 0.9 | 0.9 | 0.9 | 1.0 | 1.0 | 0.8 | 1.0 | 0.9 | 1.0 | 0.9 |
| 3 | 4 | -1 | 1 | 10 | 1 | 60 | 0.94 | 0.066332 | 0.9 | 0.9 | 1.0 | 1.0 | 1.0 | 0.8 | 1.0 | 0.9 | 1.0 | 0.9 |
| 4 | 5 | 5 | 0 | 0 | 1 | 60 | 0.91 | 0.094340 | 1.0 | 0.9 | 0.9 | 1.0 | 1.0 | 0.8 | 0.9 | 0.9 | 1.0 | 0.7 |
df tail - last 5 el
| id_alg | param_1 | param_2 | param_3 | id_dataset | param_4 | mean_ind | std_ind | ind_0 | ind_1 | ind_2 | ind_3 | ind_4 | ind_5 | ind_6 | ind_7 | ind_8 | ind_9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 555 | OBLQ_2 | -1 | - | 10 | 8 | 2 | 0.634503 | 0.107378 | 0.535714 | 0.642857 | 0.571429 | 0.765306 | 0.500000 | 0.673469 | 0.500000 | 0.687500 | 0.84375 | 0.625000 |
| 556 | OBLQ_1 | -1 | - | 5 | 8 | 3 | 0.617560 | 0.162720 | 0.758929 | 0.553571 | 0.428571 | 0.632653 | 0.265306 | 0.744898 | 0.770833 | 0.791667 | 0.68750 | 0.541667 |
| 557 | OBLQ_2 | -1 | - | 10 | 8 | 3 | 0.617560 | 0.162720 | 0.758929 | 0.553571 | 0.428571 | 0.632653 | 0.265306 | 0.744898 | 0.770833 | 0.791667 | 0.68750 | 0.541667 |
| 558 | OBLQ_1 | -1 | - | 5 | 8 | 4 | 0.553550 | 0.163918 | 0.821429 | 0.598214 | 0.428571 | 0.673469 | 0.255102 | 0.581633 | 0.437500 | 0.416667 | 0.56250 | 0.760417 |
| 559 | OBLQ_2 | -1 | - | 10 | 8 | 4 | 0.553550 | 0.163918 | 0.821429 | 0.598214 | 0.428571 | 0.673469 | 0.255102 | 0.581633 | 0.437500 | 0.416667 | 0.56250 | 0.760417 |
df info - field insight <class 'pandas.core.frame.DataFrame'> RangeIndex: 560 entries, 0 to 559 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_alg 560 non-null object 1 param_1 560 non-null int64 2 param_2 560 non-null object 3 param_3 560 non-null int64 4 id_dataset 560 non-null int64 5 param_4 560 non-null int64 6 mean_ind 560 non-null float64 7 std_ind 560 non-null float64 8 ind_0 560 non-null float64 9 ind_1 560 non-null float64 10 ind_2 560 non-null float64 11 ind_3 560 non-null float64 12 ind_4 560 non-null float64 13 ind_5 560 non-null float64 14 ind_6 560 non-null float64 15 ind_7 560 non-null float64 16 ind_8 560 non-null float64 17 ind_9 560 non-null float64 dtypes: float64(12), int64(4), object(2) memory usage: 78.9+ KB df describe - numerical values analysis
| param_1 | param_3 | id_dataset | param_4 | mean_ind | std_ind | ind_0 | ind_1 | ind_2 | ind_3 | ind_4 | ind_5 | ind_6 | ind_7 | ind_8 | ind_9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 | 560.000000 |
| mean | 3.857143 | 4.642857 | 4.500000 | 14.000000 | 0.824046 | 0.059808 | 0.825218 | 0.820586 | 0.809152 | 0.817594 | 0.826545 | 0.834645 | 0.825988 | 0.837126 | 0.823115 | 0.820490 |
| std | 4.615555 | 3.520594 | 2.293336 | 23.042312 | 0.140952 | 0.040652 | 0.155786 | 0.163345 | 0.167078 | 0.168294 | 0.172892 | 0.135144 | 0.159632 | 0.154013 | 0.142177 | 0.160281 |
| min | -1.000000 | 0.000000 | 1.000000 | 1.000000 | 0.478274 | 0.000000 | 0.321429 | 0.267857 | 0.267857 | 0.285714 | 0.255102 | 0.428571 | 0.354167 | 0.229167 | 0.437500 | 0.291667 |
| 25% | -1.000000 | 0.000000 | 2.750000 | 2.000000 | 0.706756 | 0.026300 | 0.706296 | 0.706296 | 0.709091 | 0.701667 | 0.717778 | 0.736296 | 0.709091 | 0.708194 | 0.708333 | 0.699808 |
| 50% | 5.000000 | 5.000000 | 4.500000 | 3.000000 | 0.838750 | 0.060000 | 0.841402 | 0.850725 | 0.808712 | 0.837500 | 0.863636 | 0.839161 | 0.852273 | 0.866259 | 0.823427 | 0.850000 |
| 75% | 10.000000 | 5.000000 | 6.250000 | 4.000000 | 0.954507 | 0.086318 | 0.988211 | 0.985310 | 0.973307 | 0.986842 | 0.986842 | 0.972471 | 0.991803 | 0.995066 | 0.978645 | 0.986842 |
| max | 10.000000 | 10.000000 | 8.000000 | 60.000000 | 1.000000 | 0.163918 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Count how many rows belong to each id_alg value.
alg_counts = df['id_alg'].value_counts()
display(alg_counts)
1 40 2 40 3 40 4 40 5 40 6 40 7 40 8 40 9 40 10 40 11 40 12 40 OBLQ_1 40 OBLQ_2 40 Name: id_alg, dtype: int64
We can see that the algorithms are balanced: each `id_alg` value appears in exactly 40 rows.
# Group by the ID of the algorithm and show the mean of every numeric column.
# numeric_only=True is required because the frame also holds an object-typed
# column (param_2): pandas >= 2.0 raises a TypeError on it instead of silently
# dropping it, while older versions give the same table either way.
display(df.groupby(['id_alg']).mean(numeric_only=True))
# Looking at the mean_ind we can see that Alg 10 has the highest performance, thus it could be the best algorithm.
| param_1 | param_3 | id_dataset | param_4 | mean_ind | std_ind | ind_0 | ind_1 | ind_2 | ind_3 | ind_4 | ind_5 | ind_6 | ind_7 | ind_8 | ind_9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id_alg | ||||||||||||||||
| 1 | -1.0 | 5.0 | 4.5 | 14.0 | 0.815138 | 0.063586 | 0.804921 | 0.807001 | 0.798205 | 0.819312 | 0.820425 | 0.837306 | 0.829200 | 0.830134 | 0.815362 | 0.789516 |
| 10 | 10.0 | 5.0 | 4.5 | 14.0 | 0.842060 | 0.059349 | 0.834182 | 0.841861 | 0.823770 | 0.836584 | 0.853260 | 0.845794 | 0.841602 | 0.847975 | 0.850140 | 0.845433 |
| 11 | 5.0 | 5.0 | 4.5 | 14.0 | 0.828888 | 0.060368 | 0.812112 | 0.834594 | 0.817132 | 0.824706 | 0.841335 | 0.836588 | 0.833588 | 0.824836 | 0.843937 | 0.820050 |
| 12 | 10.0 | 5.0 | 4.5 | 14.0 | 0.840358 | 0.056767 | 0.830572 | 0.827377 | 0.829001 | 0.831936 | 0.852121 | 0.846611 | 0.836797 | 0.864294 | 0.845859 | 0.839014 |
| 2 | -1.0 | 5.0 | 4.5 | 14.0 | 0.818678 | 0.059457 | 0.816857 | 0.818574 | 0.815159 | 0.808312 | 0.814471 | 0.820426 | 0.822703 | 0.834455 | 0.817088 | 0.818730 |
| 3 | -1.0 | 10.0 | 4.5 | 14.0 | 0.805701 | 0.063502 | 0.810221 | 0.800797 | 0.775401 | 0.809703 | 0.804358 | 0.817868 | 0.810778 | 0.810586 | 0.808637 | 0.808657 |
| 4 | -1.0 | 10.0 | 4.5 | 14.0 | 0.811240 | 0.060145 | 0.824286 | 0.811038 | 0.803885 | 0.802673 | 0.816197 | 0.811065 | 0.814012 | 0.814178 | 0.803742 | 0.811330 |
| 5 | 5.0 | 0.0 | 4.5 | 14.0 | 0.830560 | 0.061522 | 0.839545 | 0.835643 | 0.813922 | 0.814602 | 0.822455 | 0.859240 | 0.819014 | 0.839608 | 0.830294 | 0.831280 |
| 6 | 10.0 | 0.0 | 4.5 | 14.0 | 0.834642 | 0.058672 | 0.829171 | 0.824449 | 0.821465 | 0.822315 | 0.849859 | 0.840808 | 0.849084 | 0.839015 | 0.833172 | 0.837082 |
| 7 | 5.0 | 0.0 | 4.5 | 14.0 | 0.825373 | 0.056376 | 0.823620 | 0.818323 | 0.812200 | 0.822006 | 0.837632 | 0.836848 | 0.824796 | 0.842272 | 0.827612 | 0.808419 |
| 8 | 10.0 | 0.0 | 4.5 | 14.0 | 0.836382 | 0.054175 | 0.836333 | 0.844586 | 0.809572 | 0.818340 | 0.849193 | 0.836599 | 0.832990 | 0.855460 | 0.840686 | 0.840062 |
| 9 | 5.0 | 5.0 | 4.5 | 14.0 | 0.828987 | 0.058092 | 0.843493 | 0.816266 | 0.798642 | 0.831571 | 0.849008 | 0.833195 | 0.830737 | 0.851781 | 0.805544 | 0.829635 |
| OBLQ_1 | -1.0 | 5.0 | 4.5 | 14.0 | 0.809318 | 0.062653 | 0.823870 | 0.803849 | 0.804888 | 0.802125 | 0.780657 | 0.831343 | 0.809266 | 0.832586 | 0.800767 | 0.803827 |
| OBLQ_2 | -1.0 | 10.0 | 4.5 | 14.0 | 0.809318 | 0.062653 | 0.823870 | 0.803849 | 0.804888 | 0.802125 | 0.780657 | 0.831343 | 0.809266 | 0.832586 | 0.800767 | 0.803827 |
# Draw one histogram of mean_ind per algorithm, in order of first appearance.
ids_alg = df['id_alg'].unique()
for id_alg in ids_alg:
    # Keep only the rows that belong to the current algorithm.
    df_id = df.loc[df['id_alg'] == id_alg]
    hist_id = go.Figure(data=[go.Histogram(x=df_id['mean_ind'])])
    hist_id.update_layout(title=dict(text=f'Hist for Alg {id_alg} by mean_ind'))
    iplot(hist_id)
After looking at the histograms, we can see that some algorithms have more instances where they achieve better performance (e.g., Alg 12 has 30 data points where mean_ind was over 0.75, whereas Alg 1 has only 22).
However, some of these performances could've been obtained by using better parameter values.
We will look into that next.
# Isolate the rows of Alg 10 and inspect its dataset ID and parameter columns.
is_alg10 = df['id_alg'] == '10'
df_alg10 = df[is_alg10]
df_alg10_params = df_alg10.loc[:, ['id_dataset', 'param_1', 'param_2', 'param_3', 'param_4']]
display(df_alg10_params.describe())
display(df_alg10_params.head())
| id_dataset | param_1 | param_3 | param_4 | |
|---|---|---|---|---|
| count | 40.000000 | 40.0 | 40.0 | 40.000000 |
| mean | 4.500000 | 10.0 | 5.0 | 14.000000 |
| std | 2.320477 | 0.0 | 0.0 | 23.315011 |
| min | 1.000000 | 10.0 | 5.0 | 1.000000 |
| 25% | 2.750000 | 10.0 | 5.0 | 2.000000 |
| 50% | 4.500000 | 10.0 | 5.0 | 3.000000 |
| 75% | 6.250000 | 10.0 | 5.0 | 4.000000 |
| max | 8.000000 | 10.0 | 5.0 | 60.000000 |
| id_dataset | param_1 | param_2 | param_3 | param_4 | |
|---|---|---|---|---|---|
| 9 | 1 | 10 | 0 | 5 | 60 |
| 21 | 1 | 10 | 0 | 5 | 1 |
| 33 | 1 | 10 | 0 | 5 | 2 |
| 45 | 1 | 10 | 0 | 5 | 3 |
| 57 | 1 | 10 | 0 | 5 | 4 |
We can see that param_[1, 2, 3] always have the same values (i.e., [10, 0, 5]). Only param_4 has different values so we will use it to make a plot.
def plot_mean_param_dataset(df: pd.DataFrame) -> None:
    """Scatter-plot ``mean_ind`` against ``param_4``, colored by ``id_dataset``.

    Args:
        df: Frame providing the ``param_4``, ``mean_ind`` and ``id_dataset``
            columns.

    Returns:
        None. The figure is rendered with ``iplot``.
    """
    # Marker appearance; the color of each point encodes the dataset ID.
    marker_style = dict(
        size=14,
        color=df['id_dataset'],
        showscale=True,
        colorscale='Cividis',
        colorbar=dict(title='ID dataset'),
        opacity=0.6,
    )
    points = go.Scatter(
        x=df['param_4'],
        y=df['mean_ind'],
        mode='markers',
        marker=marker_style,
    )

    # Title centered horizontally and anchored at its top edge.
    title_cfg = dict(
        text='param_4, mean_ind & id_dataset',
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    )
    layout = go.Layout(
        title=title_cfg,
        xaxis=dict(title='Param 4'),
        yaxis=dict(title='Mean_ind'),
        hovermode='closest',
        template='plotly_white',
    )

    fig = go.Figure(data=[points], layout=layout)
    iplot(fig)
# Plot mean_ind vs. param_4 (colored by id_dataset) for the Alg 10 rows.
plot_mean_param_dataset(df_alg10)
Depending on the dataset, Alg10 has different performances. It deals really poorly with the dataset of id 8, but it performs best on dataset 5. In these cases, we can see that the value of param_4 affects the performance only slightly. Let's also look at Alg1.
# Isolate the rows of Alg 1 and inspect its dataset ID and parameter columns.
df_alg1 = df[df['id_alg'] == '1']
# The parameter view must be sliced from df_alg1 (not df_alg10); otherwise the
# summary below would describe Alg 10 again instead of Alg 1.
df_alg1_params = df_alg1[['id_dataset', 'param_1', 'param_2', 'param_3', 'param_4']]
display(df_alg1_params.describe())
display(df_alg1_params.head())
| id_dataset | param_1 | param_3 | param_4 | |
|---|---|---|---|---|
| count | 40.000000 | 40.0 | 40.0 | 40.000000 |
| mean | 4.500000 | 10.0 | 5.0 | 14.000000 |
| std | 2.320477 | 0.0 | 0.0 | 23.315011 |
| min | 1.000000 | 10.0 | 5.0 | 1.000000 |
| 25% | 2.750000 | 10.0 | 5.0 | 2.000000 |
| 50% | 4.500000 | 10.0 | 5.0 | 3.000000 |
| 75% | 6.250000 | 10.0 | 5.0 | 4.000000 |
| max | 8.000000 | 10.0 | 5.0 | 60.000000 |
| id_dataset | param_1 | param_2 | param_3 | param_4 | |
|---|---|---|---|---|---|
| 9 | 1 | 10 | 0 | 5 | 60 |
| 21 | 1 | 10 | 0 | 5 | 1 |
| 33 | 1 | 10 | 0 | 5 | 2 |
| 45 | 1 | 10 | 0 | 5 | 3 |
| 57 | 1 | 10 | 0 | 5 | 4 |
Again, only param_4 varies.
# Plot mean_ind vs. param_4 (colored by id_dataset) for the Alg 1 rows.
plot_mean_param_dataset(df_alg1)
We again see that the value of param_4 has only a small impact on the results. The algorithm however performs differently depending on the dataset.
Next, we are going to look at the ind_0, ..., ind_9 metrics. Since there are 10 of them, they will be hard to plot, thus, running PCA seems like a good idea.
# Reduce the ten ind_* metric columns to three principal components.
inds = [f'ind_{x}' for x in range(10)]
df_inds = df.loc[:, inds]
np_inds = df_inds.to_numpy()

# Rescale the whole matrix by one scalar: np.linalg.norm without an axis
# argument returns the Frobenius norm of the 2-D array, so this is a global
# scaling and not a per-column normalization.
frob_norm = np.linalg.norm(np_inds)
np_inds = np_inds / frob_norm

pca = PCA(n_components=3)
pca_result = pca.fit_transform(np_inds)

# Report the projected shape, the explained variance ratios and the singular values.
for summary in (pca_result.shape, pca.explained_variance_ratio_, pca.singular_values_):
    print(summary)
(560, 3) [0.79988495 0.04071741 0.03071345] [0.1685258 0.03802268 0.03302303]
# Wrap the projected coordinates in a frame and attach each row's algorithm ID.
df_pca = pd.DataFrame(data=pca_result, columns=['PC_1', 'PC_2', 'PC_3'])
df_pca['id_alg'] = df['id_alg']

# Show the first and the last rows of the projected data.
for preview in (df_pca.head(), df_pca.tail()):
    display(preview)
| PC_1 | PC_2 | PC_3 | id_alg | |
|---|---|---|---|---|
| 0 | -0.004907 | -0.001575 | -0.001652 | 1 |
| 1 | -0.005477 | -0.000933 | -0.000374 | 2 |
| 2 | -0.005413 | -0.001670 | -0.000775 | 3 |
| 3 | -0.005962 | -0.001984 | -0.000948 | 4 |
| 4 | -0.004413 | 0.000148 | -0.001913 | 5 |
| PC_1 | PC_2 | PC_3 | id_alg | |
|---|---|---|---|---|
| 555 | 0.009700 | -0.000481 | 0.003026 | OBLQ_2 |
| 556 | 0.010840 | -0.001824 | 0.001615 | OBLQ_1 |
| 557 | 0.010840 | -0.001824 | 0.001615 | OBLQ_2 |
| 558 | 0.013691 | 0.002102 | 0.006007 | OBLQ_1 |
| 559 | 0.013691 | 0.002102 | 0.006007 | OBLQ_2 |
# 3D scatter of the three principal components, colored by algorithm ID.
fig = px.scatter_3d(
    data_frame=df_pca,
    x='PC_1',
    y='PC_2',
    z='PC_3',
    color='id_alg',
)
fig.show()